library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages -------------------------------------------------------------------- tidyverse 1.3.1 --
√ ggplot2 3.3.5 √ purrr 0.3.4
√ tibble 3.1.6 √ dplyr 1.0.8
√ tidyr 1.2.0 √ stringr 1.4.0
√ readr 2.1.2 √ forcats 0.5.1
-- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(readxl)
Warning: package ‘readxl’ was built under R version 4.1.3
candy_2015 <- read_excel("../raw_data/boing-boing-candy-2015.xlsx")
candy_2016 <- read_excel("../raw_data/boing-boing-candy-2016.xlsx")
candy_2017 <- read_excel("../raw_data/boing-boing-candy-2017.xlsx")
New names:
* `` -> ...114
here::here()
[1] "C:/Users/mahri/OneDrive/CodeClan/dirty_data_project/dirty_data_codeclan_project_mahri/dirty_data_task_4_mahri"
From glimpse
head(candy_2015)
glimpse(candy_2016)
Rows: 1,259
Columns: 123
$ Timestamp <dttm> ~
$ `Are you going actually going trick or treating yourself?` <chr> ~
$ `Your gender:` <chr> ~
$ `How old are you?` <chr> ~
$ `Which country do you live in?` <chr> ~
$ `Which state, province, county do you live in?` <chr> ~
$ `[100 Grand Bar]` <chr> ~
$ `[Anonymous brown globs that come in black and orange wrappers]` <chr> ~
$ `[Any full-sized candy bar]` <chr> ~
$ `[Black Jacks]` <chr> ~
$ `[Bonkers (the candy)]` <chr> ~
$ `[Bonkers (the board game)]` <chr> ~
$ `[Bottle Caps]` <chr> ~
$ `[Box'o'Raisins]` <chr> ~
$ `[Broken glow stick]` <chr> ~
$ `[Butterfinger]` <chr> ~
$ `[Cadbury Creme Eggs]` <chr> ~
$ `[Candy Corn]` <chr> ~
$ `[Candy that is clearly just the stuff given out for free at restaurants]` <chr> ~
$ `[Caramellos]` <chr> ~
$ `[Cash, or other forms of legal tender]` <chr> ~
$ `[Chardonnay]` <chr> ~
$ `[Chick-o-Sticks (we don’t know what that is)]` <chr> ~
$ `[Chiclets]` <chr> ~
$ `[Coffee Crisp]` <chr> ~
$ `[Creepy Religious comics/Chick Tracts]` <chr> ~
$ `[Dental paraphenalia]` <chr> ~
$ `[Dots]` <chr> ~
$ `[Dove Bars]` <chr> ~
$ `[Fuzzy Peaches]` <chr> ~
$ `[Generic Brand Acetaminophen]` <chr> ~
$ `[Glow sticks]` <chr> ~
$ `[Goo Goo Clusters]` <chr> ~
$ `[Good N' Plenty]` <chr> ~
$ `[Gum from baseball cards]` <chr> ~
$ `[Gummy Bears straight up]` <chr> ~
$ `[Hard Candy]` <chr> ~
$ `[Healthy Fruit]` <chr> ~
$ `[Heath Bar]` <chr> ~
$ `[Hershey's Dark Chocolate]` <chr> ~
$ `[Hershey’s Milk Chocolate]` <chr> ~
$ `[Hershey's Kisses]` <chr> ~
$ `[Hugs (actual physical hugs)]` <chr> ~
$ `[Jolly Rancher (bad flavor)]` <chr> ~
$ `[Jolly Ranchers (good flavor)]` <chr> ~
$ `[JoyJoy (Mit Iodine!)]` <chr> ~
$ `[Junior Mints]` <chr> ~
$ `[Senior Mints]` <chr> ~
$ `[Kale smoothie]` <chr> ~
$ `[Kinder Happy Hippo]` <chr> ~
$ `[Kit Kat]` <chr> ~
$ `[LaffyTaffy]` <chr> ~
$ `[LemonHeads]` <chr> ~
$ `[Licorice (not black)]` <chr> ~
$ `[Licorice (yes black)]` <chr> ~
$ `[Lindt Truffle]` <chr> ~
$ `[Lollipops]` <chr> ~
$ `[Mars]` <chr> ~
$ `[Mary Janes]` <chr> ~
$ `[Maynards]` <chr> ~
$ `[Mike and Ike]` <chr> ~
$ `[Milk Duds]` <chr> ~
$ `[Milky Way]` <chr> ~
$ `[Regular M&Ms]` <chr> ~
$ `[Peanut M&M’s]` <chr> ~
$ `[Blue M&M's]` <chr> ~
$ `[Red M&M's]` <chr> ~
$ `[Third Party M&M's]` <chr> ~
$ `[Minibags of chips]` <chr> ~
$ `[Mint Kisses]` <chr> ~
$ `[Mint Juleps]` <chr> ~
$ `[Mr. Goodbar]` <chr> ~
$ `[Necco Wafers]` <chr> ~
$ `[Nerds]` <chr> ~
$ `[Nestle Crunch]` <chr> ~
$ `[Now'n'Laters]` <chr> ~
$ `[Peeps]` <chr> ~
$ `[Pencils]` <chr> ~
$ `[Person of Interest Season 3 DVD Box Set (not including Disc 4 with hilarious outtakes)]` <chr> ~
$ `[Pixy Stix]` <chr> ~
$ `[Reese’s Peanut Butter Cups]` <chr> ~
$ `[Reese's Pieces]` <chr> ~
$ `[Reggie Jackson Bar]` <chr> ~
$ `[Rolos]` <chr> ~
$ `[Skittles]` <chr> ~
$ `[Smarties (American)]` <chr> ~
$ `[Smarties (Commonwealth)]` <chr> ~
$ `[Snickers]` <chr> ~
$ `[Sourpatch Kids (i.e. abominations of nature)]` <chr> ~
$ `[Spotted Dick]` <chr> ~
$ `[Starburst]` <chr> ~
$ `[Sweet Tarts]` <chr> ~
$ `[Swedish Fish]` <chr> ~
$ `[Sweetums (a friend to diabetes)]` <chr> ~
$ `[Tic Tacs]` <chr> ~
$ `[Those odd marshmallow circus peanut things]` <chr> ~
$ `[Three Musketeers]` <chr> ~
$ `[Tolberone something or other]` <chr> ~
$ `[Trail Mix]` <chr> ~
$ `[Twix]` <chr> ~
$ `[Vials of pure high fructose corn syrup, for main-lining into your vein]` <chr> ~
$ `[Vicodin]` <chr> ~
$ `[Whatchamacallit Bars]` <chr> ~
$ `[White Bread]` <chr> ~
$ `[Whole Wheat anything]` <chr> ~
$ `[York Peppermint Patties]` <chr> ~
$ `Please list any items not included above that give you JOY.` <chr> ~
$ `Please list any items not included above that give you DESPAIR.` <chr> ~
$ `Please leave any witty, snarky or thoughtful remarks or comments regarding your choices.` <chr> ~
$ `Guess the number of mints in my hand.` <chr> ~
$ `Betty or Veronica?` <chr> ~
$ `"That dress* that went viral a few years back - when I first saw it, it was ________"` <chr> ~
$ `What is your favourite font?` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [JK Rowling]` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [JJ Abrams]` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [Beyoncé]` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [Bieber]` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [Kevin Bacon]` <chr> ~
$ `Please estimate the degree(s) of separation you have from the following celebrities [Francis Bacon (1561 - 1626)]` <chr> ~
$ `Which day do you prefer, Friday or Sunday?` <chr> ~
$ `Do you eat apples the correct way, East to West (side to side) or do you eat them like a freak of nature, South to North (bottom to top)?` <chr> ~
$ `When you see the above image of the 4 different websites, which one would you most likely check out (please be honest).` <chr> ~
$ `[York Peppermint Patties] Ignore` <lgl> ~
glimpse(candy_2017)
Rows: 2,460
Columns: 120
$ `Internal ID` <dbl> 90258773, ~
$ `Q1: GOING OUT?` <chr> NA, "No", ~
$ `Q2: GENDER` <chr> NA, "Male"~
$ `Q3: AGE` <chr> NA, "44", ~
$ `Q4: COUNTRY` <chr> NA, "USA",~
$ `Q5: STATE, PROVINCE, COUNTY, ETC` <chr> NA, "NM", ~
$ `Q6 | 100 Grand Bar` <chr> NA, "MEH",~
$ `Q6 | Anonymous brown globs that come in black and orange wrappers\t(a.k.a. Mary Janes)` <chr> NA, "DESPA~
$ `Q6 | Any full-sized candy bar` <chr> NA, "JOY",~
$ `Q6 | Black Jacks` <chr> NA, "MEH",~
$ `Q6 | Bonkers (the candy)` <chr> NA, "DESPA~
$ `Q6 | Bonkers (the board game)` <chr> NA, "DESPA~
$ `Q6 | Bottle Caps` <chr> NA, "DESPA~
$ `Q6 | Box'o'Raisins` <chr> NA, "DESPA~
$ `Q6 | Broken glow stick` <chr> NA, "DESPA~
$ `Q6 | Butterfinger` <chr> NA, "DESPA~
$ `Q6 | Cadbury Creme Eggs` <chr> NA, "MEH",~
$ `Q6 | Candy Corn` <chr> NA, "MEH",~
$ `Q6 | Candy that is clearly just the stuff given out for free at restaurants` <chr> NA, "DESPA~
$ `Q6 | Caramellos` <chr> NA, "MEH",~
$ `Q6 | Cash, or other forms of legal tender` <chr> NA, "JOY",~
$ `Q6 | Chardonnay` <chr> NA, "MEH",~
$ `Q6 | Chick-o-Sticks (we don’t know what that is)` <chr> NA, "DESPA~
$ `Q6 | Chiclets` <chr> NA, "DESPA~
$ `Q6 | Coffee Crisp` <chr> NA, "DESPA~
$ `Q6 | Creepy Religious comics/Chick Tracts` <chr> NA, "DESPA~
$ `Q6 | Dental paraphenalia` <chr> NA, "DESPA~
$ `Q6 | Dots` <chr> NA, "MEH",~
$ `Q6 | Dove Bars` <chr> NA, "JOY",~
$ `Q6 | Fuzzy Peaches` <chr> NA, "DESPA~
$ `Q6 | Generic Brand Acetaminophen` <chr> NA, "DESPA~
$ `Q6 | Glow sticks` <chr> NA, "DESPA~
$ `Q6 | Goo Goo Clusters` <chr> NA, "DESPA~
$ `Q6 | Good N' Plenty` <chr> NA, "MEH",~
$ `Q6 | Gum from baseball cards` <chr> NA, "DESPA~
$ `Q6 | Gummy Bears straight up` <chr> NA, "MEH",~
$ `Q6 | Hard Candy` <chr> NA, "MEH",~
$ `Q6 | Healthy Fruit` <chr> NA, "DESPA~
$ `Q6 | Heath Bar` <chr> NA, "MEH",~
$ `Q6 | Hershey's Dark Chocolate` <chr> NA, "JOY",~
$ `Q6 | Hershey’s Milk Chocolate` <chr> NA, "JOY",~
$ `Q6 | Hershey's Kisses` <chr> NA, "MEH",~
$ `Q6 | Hugs (actual physical hugs)` <chr> NA, "DESPA~
$ `Q6 | Jolly Rancher (bad flavor)` <chr> NA, "DESPA~
$ `Q6 | Jolly Ranchers (good flavor)` <chr> NA, "MEH",~
$ `Q6 | JoyJoy (Mit Iodine!)` <chr> NA, "DESPA~
$ `Q6 | Junior Mints` <chr> NA, "DESPA~
$ `Q6 | Senior Mints` <chr> NA, "DESPA~
$ `Q6 | Kale smoothie` <chr> NA, "DESPA~
$ `Q6 | Kinder Happy Hippo` <chr> NA, "DESPA~
$ `Q6 | Kit Kat` <chr> NA, "JOY",~
$ `Q6 | LaffyTaffy` <chr> NA, "DESPA~
$ `Q6 | LemonHeads` <chr> NA, "MEH",~
$ `Q6 | Licorice (not black)` <chr> NA, "MEH",~
$ `Q6 | Licorice (yes black)` <chr> NA, "JOY",~
$ `Q6 | Lindt Truffle` <chr> NA, "MEH",~
$ `Q6 | Lollipops` <chr> NA, "DESPA~
$ `Q6 | Mars` <chr> NA, "DESPA~
$ `Q6 | Maynards` <chr> NA, "DESPA~
$ `Q6 | Mike and Ike` <chr> NA, "MEH",~
$ `Q6 | Milk Duds` <chr> NA, "MEH",~
$ `Q6 | Milky Way` <chr> NA, "JOY",~
$ `Q6 | Regular M&Ms` <chr> NA, "JOY",~
$ `Q6 | Peanut M&M’s` <chr> NA, "MEH",~
$ `Q6 | Blue M&M's` <chr> NA, "JOY",~
$ `Q6 | Red M&M's` <chr> NA, "JOY",~
$ `Q6 | Green Party M&M's` <chr> NA, "JOY",~
$ `Q6 | Independent M&M's` <chr> NA, "JOY",~
$ `Q6 | Abstained from M&M'ing.` <chr> NA, "DESPA~
$ `Q6 | Minibags of chips` <chr> NA, "DESPA~
$ `Q6 | Mint Kisses` <chr> NA, "MEH",~
$ `Q6 | Mint Juleps` <chr> NA, "DESPA~
$ `Q6 | Mr. Goodbar` <chr> NA, "DESPA~
$ `Q6 | Necco Wafers` <chr> NA, "DESPA~
$ `Q6 | Nerds` <chr> NA, "DESPA~
$ `Q6 | Nestle Crunch` <chr> NA, "JOY",~
$ `Q6 | Now'n'Laters` <chr> NA, "DESPA~
$ `Q6 | Peeps` <chr> NA, "DESPA~
$ `Q6 | Pencils` <chr> NA, "DESPA~
$ `Q6 | Pixy Stix` <chr> NA, "DESPA~
$ `Q6 | Real Housewives of Orange County Season 9 Blue-Ray` <chr> NA, "DESPA~
$ `Q6 | Reese’s Peanut Butter Cups` <chr> NA, "JOY",~
$ `Q6 | Reese's Pieces` <chr> NA, "JOY",~
$ `Q6 | Reggie Jackson Bar` <chr> NA, "DESPA~
$ `Q6 | Rolos` <chr> NA, "JOY",~
$ `Q6 | Sandwich-sized bags filled with BooBerry Crunch` <chr> NA, "DESPA~
$ `Q6 | Skittles` <chr> NA, "DESPA~
$ `Q6 | Smarties (American)` <chr> NA, "DESPA~
$ `Q6 | Smarties (Commonwealth)` <chr> NA, "DESPA~
$ `Q6 | Snickers` <chr> NA, "MEH",~
$ `Q6 | Sourpatch Kids (i.e. abominations of nature)` <chr> NA, "DESPA~
$ `Q6 | Spotted Dick` <chr> NA, "DESPA~
$ `Q6 | Starburst` <chr> NA, "MEH",~
$ `Q6 | Sweet Tarts` <chr> NA, "DESPA~
$ `Q6 | Swedish Fish` <chr> NA, "MEH",~
$ `Q6 | Sweetums (a friend to diabetes)` <chr> NA, "DESPA~
$ `Q6 | Take 5` <chr> NA, "DESPA~
$ `Q6 | Tic Tacs` <chr> NA, "DESPA~
$ `Q6 | Those odd marshmallow circus peanut things` <chr> NA, "DESPA~
$ `Q6 | Three Musketeers` <chr> NA, "JOY",~
$ `Q6 | Tolberone something or other` <chr> NA, "JOY",~
$ `Q6 | Trail Mix` <chr> NA, "DESPA~
$ `Q6 | Twix` <chr> NA, "JOY",~
$ `Q6 | Vials of pure high fructose corn syrup, for main-lining into your vein` <chr> NA, "DESPA~
$ `Q6 | Vicodin` <chr> NA, "DESPA~
$ `Q6 | Whatchamacallit Bars` <chr> NA, "DESPA~
$ `Q6 | White Bread` <chr> NA, "DESPA~
$ `Q6 | Whole Wheat anything` <chr> NA, "DESPA~
$ `Q6 | York Peppermint Patties` <chr> NA, "DESPA~
$ `Q7: JOY OTHER` <chr> NA, "Mound~
$ `Q8: DESPAIR OTHER` <chr> NA, NA, NA~
$ `Q9: OTHER COMMENTS` <chr> NA, "Botto~
$ `Q10: DRESS` <chr> NA, "White~
$ ...114 <chr> NA, NA, NA~
$ `Q11: DAY` <chr> NA, "Sunda~
$ `Q12: MEDIA [Daily Dish]` <dbl> NA, NA, NA~
$ `Q12: MEDIA [Science]` <dbl> NA, 1, NA,~
$ `Q12: MEDIA [ESPN]` <dbl> NA, NA, NA~
$ `Q12: MEDIA [Yahoo]` <dbl> NA, NA, NA~
$ `Click Coordinates (x, y)` <chr> NA, "(84, ~
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
janitor_candy_2015 <- janitor::clean_names(candy_2015)
janitor_candy_2015
janitor_candy_2016 <- janitor::clean_names(candy_2016)
janitor_candy_2016
janitor_candy_2017 <- janitor::clean_names(candy_2017)
janitor_candy_2017
just looking at who is reporting back about these ones…
janitor_candy_2017 %>%
select(q3_age, q2_gender, q6_independent_m_ms, q6_green_party_m_ms)
janitor_candy_2016 %>%
select(york_peppermint_patties_ignore)
#nobody
REMOVE AND RENAME
Step 1 - Remove and add for each year
Step 2 - RENAME FOR EACH YEAR
2015 REMOVE AND ADD
# names(janitor_candy_2015)
col_removed_candy_2015 <- janitor_candy_2015 %>%
select(-c(116:124), -c(97:113), -c(93:95), -c(90, 91),
-c(peterson_brand_sidewalk_chalk, spotted_dick, mint_leaves,
joy_joy_mit_iodine, minibags_of_chips, lapel_pins, kale_smoothie,
hugs_actual_physical_hugs, heath_bar, healthy_fruit,
creepy_religious_comics_chick_tracts, broken_glow_stick,
glow_sticks, generic_brand_acetaminophen, dental_paraphenalia,
cash_or_other_forms_of_legal_tender,
vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein,
box_o_raisins, timestamp)) %>%
add_column(year = "2015", .before = 1) %>%
mutate(id_number = row_number(), .before = 2)
col_removed_candy_2015
#view(col_removed_candy_2015)
2015 RENAME
2016 - REMOVE AND ADD
col_removed_candy_2016 <- janitor_candy_2016 %>%
select(-c(104, 105, 107:123),
-c(vicodin, vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein,
trail_mix, spotted_dick,
person_of_interest_season_3_dvd_box_set_not_including_disc_4_with_hilarious_outtakes,
minibags_of_chips, kale_smoothie, joy_joy_mit_iodine, hugs_actual_physical_hugs,
heath_bar, healthy_fruit, glow_sticks, generic_brand_acetaminophen,
dental_paraphenalia, creepy_religious_comics_chick_tracts, chardonnay,
cash_or_other_forms_of_legal_tender, broken_glow_stick, boxo_raisins,
bonkers_the_board_game, timestamp)) %>%
add_column(year = "2016", .before = 1) %>%
mutate(id_number = max(candy_2015_renamed$id_number) + row_number(), .before = 2)
col_removed_candy_2016
# mutate(person_id = max(candy_2015_clean$person_id) + row_number()) %>%
2016 RENAME
# Again checking on Mary Janes column differences
# col_removed_candy_2016 %>%
# select(anonymous_brown_globs_that_come_in_black_and_orange_wrappers, mary_janes)
candy_2016_renamed <- col_removed_candy_2016 %>%
rename(trick_or_treating =
are_you_going_actually_going_trick_or_treating_yourself,
gender = your_gender,
age = how_old_are_you,
country = which_country_do_you_live_in,
state_or_prov = which_state_province_county_do_you_live_in,
anonymous_black_and_orange_wrapper =
anonymous_brown_globs_that_come_in_black_and_orange_wrappers,
bonkers = bonkers_the_candy,
restaurant_candy =
candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants,
chick_o_stick = chick_o_sticks_we_don_t_know_what_that_is,
gummy_bears = gummy_bears_straight_up,
hersheys_milk_chocolate = hershey_s_milk_chocolate,
licorice_black = licorice_yes_black,
peanut_m_ms = peanut_m_m_s,
party_bag_m_ms = third_party_m_ms,
reeses_peanut_butter_cups = reese_s_peanut_butter_cups,
sourpatch_kids = sourpatch_kids_i_e_abominations_of_nature,
sweetarts = sweet_tarts,
sweetums = sweetums_a_friend_to_diabetes,
circus_peanuts = those_odd_marshmallow_circus_peanut_things,
toblerone = tolberone_something_or_other)
candy_2016_renamed
NA
2017 CLEAN AND ADD Note - should have removed the “q6_” before this, but have done so in the next chunk
col_removed_candy_2017 <- janitor_candy_2017 %>%
select(-c(102, 104, 105, 107, 108, 110:120),
-c(q6_spotted_dick,
q6_sandwich_sized_bags_filled_with_boo_berry_crunch,
q6_real_housewives_of_orange_county_season_9_blue_ray,
q6_minibags_of_chips,
q6_abstained_from_m_ming,
q6_kale_smoothie, q6_joy_joy_mit_iodine,
q6_hugs_actual_physical_hugs,
q6_heath_bar,
q6_healthy_fruit,
q6_glow_sticks,
q6_generic_brand_acetaminophen,
q6_dental_paraphenalia,
q6_creepy_religious_comics_chick_tracts,
q6_chardonnay,
q6_cash_or_other_forms_of_legal_tender,
q6_broken_glow_stick,
q6_boxo_raisins,
q6_bonkers_the_board_game,
internal_id)) %>%
add_column(year = "2017", .before = 1) %>%
mutate(id_number = max(candy_2016_renamed$id_number) + row_number(), .before = 2)
col_removed_candy_2017
2017 RENAME - get rid of “q1/2/3/4/5/6” at the start of col names and rename to match 2015 and 16
candy_2017_q_removed <- col_removed_candy_2017 %>%
rename_all(~ sub("^[q0-9]{2}_", "",
make.names(names(col_removed_candy_2017))))
candy_2017_renamed <- candy_2017_q_removed %>%
rename(trick_or_treating = going_out,
state_or_prov = state_province_county_etc,
x100_grand_bar = `100_grand_bar`,
mary_janes =
anonymous_brown_globs_that_come_in_black_and_orange_wrappers_a_k_a_mary_janes,
bonkers = bonkers_the_candy,
restaurant_candy =
candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants,
chick_o_stick = chick_o_sticks_we_don_t_know_what_that_is,
gummy_bears = gummy_bears_straight_up,
hersheys_milk_chocolate = hershey_s_milk_chocolate,
licorice_black = licorice_yes_black,
peanut_m_ms = peanut_m_m_s,
green_m_ms = green_party_m_ms,
lone_m_ms = independent_m_ms,
reeses_peanut_butter_cups = reese_s_peanut_butter_cups,
sourpatch_kids = sourpatch_kids_i_e_abominations_of_nature,
sweetarts = sweet_tarts,
sweetums = sweetums_a_friend_to_diabetes,
circus_peanuts = those_odd_marshmallow_circus_peanut_things,
toblerone = tolberone_something_or_other)
candy_2017_renamed
view(candy_2015_renamed)
view(candy_2016_renamed)
view(candy_2017_renamed)
Getting an idea of people’s responses:
distinct(candy_2015_renamed, age)
#(chr) 146 responses, some silly and some strange
distinct(candy_2015_renamed, trick_or_treating)
#(chr) yes or no (NAs in 2017)
distinct(candy_2015_renamed, starburst)
distinct(candy_2016_renamed, age)
# (chr) 98 incl silly/strange
distinct(candy_2016_renamed, trick_or_treating)
#(chr) Yes No (NAs in 2017)
distinct(candy_2016_renamed, gender)
# Male, Female, Other, I'd rather not say, NA
distinct(candy_2016_renamed, country)
#93 some silly, some e.g. USA, US, us, u.s.a. etc
distinct(candy_2016_renamed, starburst)
distinct(candy_2017_renamed, age)
#(chr) 107 incl silly/strange
distinct(candy_2017_renamed, trick_or_treating)
#(chr) Yes No and NA
distinct(candy_2017_renamed, gender)
#Male, Female, Other, I'd rather not say, NA
distinct(candy_2017_renamed, country)
#118 some silly, some e.g. USA, US, us, u.s.a. etc
distinct(candy_2017_renamed, starburst)
Joining all three years by binding rows so as to keep everything
bound_candy <- bind_rows(candy_2015_renamed,
candy_2016_renamed,
candy_2017_renamed)
bound_candy <- bound_candy %>%
relocate(country, .before = 5) %>%
relocate(state_or_prov, .before = 6) %>%
relocate(gender, .before = 7)
# view(bound_candy)
QUESTION 1 What is the total number of candy ratings given across the three years. (Number of candy ratings, not the number of raters. Don’t count missing values)
ANSWER - 590,010 ratings
AGE CLEANING Age is a character column with 274 values. They are numbers, NAs, and strange and silly values - changed age column to a numeric but it output strange figures, so I specified integer and strange answers became NAs. - Seems unlikely that anyone > 100 years old is answering so I removed them
# bound_candy %>%
# distinct(age)
bound_age_to_numeric <- bound_candy %>%
mutate(age = as.integer(age)) %>%
arrange(age)
Warning: Problem while computing `age = as.integer(age)`.
i NAs introduced by coercion
Warning: Problem while computing `age = as.integer(age)`.
i NAs introduced by coercion to integer range
# bound_age_to_numeric %>%
# distinct(age)
bound_age_cleaning <- bound_age_to_numeric %>%
mutate(age = if_else(age > 99, NA_integer_, age))
bound_age_cleaning
# view(bound_age_cleaning)
QUESTION TWO What was the average age of people who are going out trick or treating?
I know the responses for trick or treating are “Yes” “No” and “NA”
ANSWER - The (rounded) average age of those going trick or treating is 35 years old (34.94897 is the unrounded value)
bound_age_cleaning %>%
select(age, trick_or_treating) %>%
group_by(trick_or_treating) %>%
summarise(average_age = round(mean(age, na.rm = TRUE)))
# to just get the answer for Yes on it's own:
bound_age_cleaning %>%
select(age, trick_or_treating) %>%
group_by(trick_or_treating) %>%
filter(trick_or_treating == "Yes") %>%
summarise(average_age = round(mean(age, na.rm = TRUE)))
NA
QUESTION THREE What was the average age of people who are not going trick or treating? ANSWER - The (rounded) average age of those not going trick or treating is 39 years old (39.10454 is the unrounded value)
bound_age_cleaning %>%
select(age, trick_or_treating) %>%
group_by(trick_or_treating) %>%
summarise(average_age = round(mean(age, na.rm = TRUE)))
# to just get the answer for No on it's own:
bound_age_cleaning %>%
select(age, trick_or_treating) %>%
group_by(trick_or_treating) %>%
filter(trick_or_treating == "No") %>%
summarise(average_age = round(mean(age, na.rm = TRUE)))
NA
QUESTION FOUR For each of joy, despair and meh, which candy bar revived the most of these ratings?
As in question 1, a 2 column tibble with candy and rating was created and a count of each distinct answer was made - firstly counting each for each response and each candy, then filtering to find the maximum count for each rating.
ANSWER -
Despair: gum that comes with baseball cards returned the most despair
responses with 7,341 Joy:Full sized candy bars made the most people
joyful with 7,589 responses. This seems very generic so I ran it again
to remove the full sized candy bars and the top Joy response was: 7369
responses for reeses peanut butter cups Meh: 1,570 “Meh” responses were
given for lollipops
rated_candy <- bound_candy %>%
select(-c(year, id_number, age,
trick_or_treating, country,
state_or_prov, gender, any_full_sized_candy_bar)) %>%
pivot_longer(butterfinger:take_5,
names_to = "candy",
values_to = "rating")
rated_candy
rated_candy %>%
group_by(rating, candy) %>%
summarise(number_of_ratings = n()) %>%
filter(number_of_ratings == max(number_of_ratings))
`summarise()` has grouped output by 'rating'. You can override using the `.groups` argument.
COUNTRY CLEANING
Firstly getting an idea of NAs and distinct country values:
Note that 2015 (5630 rows of 9349) has no country data… (all NA)
5715 rows in bound_candy has NA
There are 169 distinct country names including NAs, miss-spellings, and silly/ unknown answers
Using stringr and regex to reduce “country” values - For columns that were numbers or clearly fake (silly) answers, I checked the state or province column to see if there was a match to a country - I checked my work one row at a time to try and ensure I did not change anything that was not meant to be changed. - I am sure there are faster/ better ways of doing this, but I was practicing different options
library(stringr)
country = str_replace_all(country, pattern = “[ ][uU]+[ .!][sS]+[ .!][aA][ .!]*“,”States”) tried this to change what was now States USA USA but it changed that to StatesStatesStates and also Australia to AStatestralia…
bound_country_clean <- bound_candy %>%
mutate(country = str_replace_all(country, pattern = "[0-9][0-9][.][0-9]", "States"),
country = str_replace_all(country, pattern = "[3|4|5][0-9]", "States"),
country = str_replace_all(country, pattern = "^[ ]*[uU]+[ .!]*[sS]+[ .!]*[aA]*[ .!]*", "States"),
country = str_replace_all(country, pattern = "^[uU][nited]+\\s[sS][tT]*[aA|eE|sS][tT][eE][sS|aA]", "States"),
country = str_replace_all(country, pattern = "^[uU][nN][iI][tT][eE][dD|sS]+\\s[sS][tT][aA][tT][eE|sS][sS]*", "States"),
# the above doesn't work for all... i presume there are spaces somewhere
country = str_replace_all(country, pattern = "^[uU][nN][iI][tT][sS]+\\s[sS][tT][aA][tT][eE][sS]*", "States"),
country = str_replace_all(country, pattern = "[sS][tT][aA|eE][tT]*[eE][sS]*", "States"),
country = str_replace_all(country, pattern = "[uU]+[sS]+[aA]+", "States"),
country = str_replace_all(country, pattern = "^[mM][uU|eE][rR]+[iI][cC|kK][aA]", "States"),
country = str_replace_all(country, pattern = "^[aA][mM][eE][rR][iI][cC][aA]", "States"),
country = str_replace_all(country, pattern = "^\\'[mM][uU|eE][rR][iI][cC][aA]", "States"),
country = str_replace_all(country, pattern = "[sS][tT][aA][tT][eE][sS][!]", "States"),
country = str_replace_all(country, pattern = "[a-zA-Z]+ [-]+ [uU][sS][aA]", "States"),
# the above only changed "the best one - usa" to "the best States"
country = str_replace_all(country, pattern = "[of] [aA][merica]", ""),
# changes States of America to States oerica and i also have united states oerica
country = str_replace_all(country, pattern = "cascadia", "States"),
country = str_replace_all(country, pattern = "Narnia", "States"),
country = str_replace_all(country, pattern = "Sub-Canadian North America... 'Merica", "States"),
country = str_replace_all(country, pattern = "Trumpistan", "States"),
country = str_replace_all(country, pattern = "The republic of Cascadia", "States"),
country = str_replace_all(country, pattern = "unhinged ", ""),
country = str_replace_all(country, pattern = "North Carolina", "States"),
country = str_replace_all(country, pattern = "Pittsburgh", "States"),
country = str_replace_all(country, pattern = "New York", "States"),
country = str_replace_all(country, pattern = "Ahem....Amerca", "States"),
country = str_replace_all(country, pattern = "UD", "States"),
country = str_replace_all(country, pattern = "New Jersey", "States"),
country = str_replace_all(country, pattern = "Alaska", "States"),
country = str_replace_all(country, pattern = "N. America", "States"),
country = str_replace_all(country, pattern = "I don't know anymore", "States"),
country = str_replace_all(country, pattern = "!.*", ""),
country = str_replace_all(country, pattern = "States[sS|dD|aA]", "States"),
country = str_replace_all(country, pattern = "States oerica", "States"),
country = str_replace_all(country, pattern = "Statesof A", "States"),
country = str_replace_all(country, pattern = "United States", "States"),
country = str_replace_all(country, pattern = "The United States", "States"),
country = str_replace_all(country, pattern = "The States", "States"),
country = str_replace_all(country, pattern = "Statestates States", "States"),
country = str_replace_all(country, pattern = "Statestates States States", "States"),
country = str_replace_all(country, pattern = "Statestates", "States"),
country = str_replace_all(country, pattern = "States States", "States"),
country = str_replace_all(country, pattern = "^[uU]+[.]*[kK]+[.]*", "United Kingdom"),
country = str_replace_all(country, pattern = "^[uU][nited]+\\s[kK][iI][nN][dD][oO][mM]", "United Kingdom"),
country = str_replace_all(country, pattern = "^[uU][nN][iI][tT][eE][dD]+\\s[kK][iI][nN][gG][dD][oO][mM]", "United Kingdom"),
country = str_replace_all(country, pattern = "^[eE][nN][gG|dD][lL][aA][nN][dD]", "United Kingdom"),
country = str_replace_all(country, pattern = "Scotland", "United Kingdom"),
country = str_replace_all(country, pattern = "^[cC][a-zA-Z]{5}", "Canada"),
country = str_replace_all(country, pattern = "^[cC]+[aA]+[nN]+[aA]+[dD]+[aA]+[aA|iI|rR]*[aA|nN]*[iI]*[aA]*", "Canada"),
country = str_replace_all(country, pattern = "Canae", "Canada"),
country = str_replace_all(country, pattern = "^Canada`", "Canada"),
country = str_replace_all(country, pattern = "soviet canuckistan`", "Canada")
)
bound_country_clean %>%
distinct(country)
view(bound_country_clean)
Still and issue: # States? Hard to tell anymore.. # one “United
States” not changing (probably spaces issue??) # States (I think but
it’s an election year so who can really tell) # I pretend to be from
Canada, but I am really from the United States. # there isn’t one for
old men # one of ones (previously “one of the good ones”?) # The Yoo
Essaayyyyyy # this one # neverland # somewhere # god’s country # EUA #
See above # Not the or Canada (previously not the US or canada) # Denial
# Earth # insanity lately # A # Can # Atlantis # Narnia # 1 # subscribe
to dm4uz3 on youtube
# Fear and Loathing